import sentencepiece as spm
from ai.audio_dataset import build_vocab


def do_test():
    """Train a SentencePiece BPE tokenizer from a JSONL corpus and smoke-test it.

    Steps:
      1. Dump the 'comment' field of each JSONL record into a plain-text manifest.
      2. Train a BPE SentencePiece model on that manifest.
      3. Load the trained model and tokenize a sample sentence as a sanity check.

    Side effects: writes the text manifest and the trained model/vocab files
    (``<prefix>.model`` / ``<prefix>.vocab``) to the current directory.
    """
    jsonl_file_path = './data/clean_data.jsonl'
    manifest_path = './output.txt'
    model_prefix = 'gxlmodel'

    # Dump one line of text per JSONL record (the 'comment' field) for training.
    with open(manifest_path, mode='w', encoding='utf-8') as f:
        build_vocab.dump_text_manifest(f, jsonl_file_path, key='comment')

    # Train the SentencePiece model. character_coverage < 1.0 is the usual
    # setting for CJK corpora with very large character sets.
    spm.SentencePieceTrainer.Train(
        input=manifest_path,
        vocab_size=50000,
        model_type='bpe',
        model_prefix=model_prefix,
        input_sentence_size=100000000,
        character_coverage=0.995
    )

    # Load the model just trained. BUG FIX: the original loaded 'mymodel.model',
    # which does not match the trained prefix ('gxlmodel') and would fail.
    sp = spm.SentencePieceProcessor()
    sp.load(f'{model_prefix}.model')

    # Sanity check: tokenize a sample sentence and print the pieces.
    result = sp.encode('你好, 你是谁, 我是更雪龙', out_type=str)
    print(result)

# Script entry point: run the tokenizer training smoke test.
# (Removed the stray empty string statement `""""""` — it was a no-op, not a docstring.)
if __name__ == '__main__':
    do_test()